{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Categorical Feature Encoding Challenge\n",
    "\n",
    "这是 kaggle 上的一个练习赛，地址在 [这里](https://www.kaggle.com/c/cat-in-the-dat/overview)，给出了一个全部由 category 特征构成的数据集。通过玩这个项目，可以了解到如何处理 category 特征，以及维度很高的时候，如何使用稀疏向量来存储数据。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "数据分析\n",
    "\"\"\"\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the competition's training and test sets from the local data directory.\n",
    "train = pd.read_csv('../data/cat-in-the-dat/train.csv')\n",
    "test = pd.read_csv('../data/cat-in-the-dat/test.csv')\n",
    "\n",
    "# Keep untouched copies so each scheme below can restart from the raw frames.\n",
    "train_copy = train.copy()\n",
    "test_copy = test.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 观察数据\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((300000, 25), (200000, 24))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compare the row/column counts of the training and test frames.\n",
    "train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>bin_0</th>\n",
       "      <th>bin_1</th>\n",
       "      <th>bin_2</th>\n",
       "      <th>bin_3</th>\n",
       "      <th>bin_4</th>\n",
       "      <th>nom_0</th>\n",
       "      <th>nom_1</th>\n",
       "      <th>nom_2</th>\n",
       "      <th>nom_3</th>\n",
       "      <th>nom_4</th>\n",
       "      <th>nom_5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>T</td>\n",
       "      <td>Y</td>\n",
       "      <td>Green</td>\n",
       "      <td>Triangle</td>\n",
       "      <td>Snake</td>\n",
       "      <td>Finland</td>\n",
       "      <td>Bassoon</td>\n",
       "      <td>50f116bcf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>T</td>\n",
       "      <td>Y</td>\n",
       "      <td>Green</td>\n",
       "      <td>Trapezoid</td>\n",
       "      <td>Hamster</td>\n",
       "      <td>Russia</td>\n",
       "      <td>Piano</td>\n",
       "      <td>b3b4d25d0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>F</td>\n",
       "      <td>Y</td>\n",
       "      <td>Blue</td>\n",
       "      <td>Trapezoid</td>\n",
       "      <td>Lion</td>\n",
       "      <td>Russia</td>\n",
       "      <td>Theremin</td>\n",
       "      <td>3263bdce5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>F</td>\n",
       "      <td>Y</td>\n",
       "      <td>Red</td>\n",
       "      <td>Trapezoid</td>\n",
       "      <td>Snake</td>\n",
       "      <td>Canada</td>\n",
       "      <td>Oboe</td>\n",
       "      <td>f12246592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>Red</td>\n",
       "      <td>Trapezoid</td>\n",
       "      <td>Lion</td>\n",
       "      <td>Canada</td>\n",
       "      <td>Oboe</td>\n",
       "      <td>5b0f5acd5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  bin_0  bin_1  bin_2 bin_3 bin_4  nom_0      nom_1    nom_2    nom_3  \\\n",
       "0   0      0      0      0     T     Y  Green   Triangle    Snake  Finland   \n",
       "1   1      0      1      0     T     Y  Green  Trapezoid  Hamster   Russia   \n",
       "2   2      0      0      0     F     Y   Blue  Trapezoid     Lion   Russia   \n",
       "3   3      0      1      0     F     Y    Red  Trapezoid    Snake   Canada   \n",
       "4   4      0      0      0     F     N    Red  Trapezoid     Lion   Canada   \n",
       "\n",
       "      nom_4      nom_5  \n",
       "0   Bassoon  50f116bcf  \n",
       "1     Piano  b3b4d25d0  \n",
       "2  Theremin  3263bdce5  \n",
       "3      Oboe  f12246592  \n",
       "4      Oboe  5b0f5acd5  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows, restricted to the leading 12 columns of the frame.\n",
    "train.iloc[:,:12].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nom_6</th>\n",
       "      <th>nom_7</th>\n",
       "      <th>nom_8</th>\n",
       "      <th>nom_9</th>\n",
       "      <th>ord_0</th>\n",
       "      <th>ord_1</th>\n",
       "      <th>ord_2</th>\n",
       "      <th>ord_3</th>\n",
       "      <th>ord_4</th>\n",
       "      <th>ord_5</th>\n",
       "      <th>day</th>\n",
       "      <th>month</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3ac1b8814</td>\n",
       "      <td>68f6ad3e9</td>\n",
       "      <td>c389000ab</td>\n",
       "      <td>2f4cb3d51</td>\n",
       "      <td>2</td>\n",
       "      <td>Grandmaster</td>\n",
       "      <td>Cold</td>\n",
       "      <td>h</td>\n",
       "      <td>D</td>\n",
       "      <td>kr</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>fbcb50fc1</td>\n",
       "      <td>3b6dd5612</td>\n",
       "      <td>4cd920251</td>\n",
       "      <td>f83c56c21</td>\n",
       "      <td>1</td>\n",
       "      <td>Grandmaster</td>\n",
       "      <td>Hot</td>\n",
       "      <td>a</td>\n",
       "      <td>A</td>\n",
       "      <td>bF</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0922e3cb8</td>\n",
       "      <td>a6a36f527</td>\n",
       "      <td>de9c9f684</td>\n",
       "      <td>ae6800dd0</td>\n",
       "      <td>1</td>\n",
       "      <td>Expert</td>\n",
       "      <td>Lava Hot</td>\n",
       "      <td>h</td>\n",
       "      <td>R</td>\n",
       "      <td>Jc</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50d7ad46a</td>\n",
       "      <td>ec69236eb</td>\n",
       "      <td>4ade6ab69</td>\n",
       "      <td>8270f0d71</td>\n",
       "      <td>1</td>\n",
       "      <td>Grandmaster</td>\n",
       "      <td>Boiling Hot</td>\n",
       "      <td>i</td>\n",
       "      <td>D</td>\n",
       "      <td>kW</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1fe17a1fd</td>\n",
       "      <td>04ddac2be</td>\n",
       "      <td>cb43ab175</td>\n",
       "      <td>b164b72a7</td>\n",
       "      <td>1</td>\n",
       "      <td>Grandmaster</td>\n",
       "      <td>Freezing</td>\n",
       "      <td>a</td>\n",
       "      <td>R</td>\n",
       "      <td>qP</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       nom_6      nom_7      nom_8      nom_9  ord_0        ord_1  \\\n",
       "0  3ac1b8814  68f6ad3e9  c389000ab  2f4cb3d51      2  Grandmaster   \n",
       "1  fbcb50fc1  3b6dd5612  4cd920251  f83c56c21      1  Grandmaster   \n",
       "2  0922e3cb8  a6a36f527  de9c9f684  ae6800dd0      1       Expert   \n",
       "3  50d7ad46a  ec69236eb  4ade6ab69  8270f0d71      1  Grandmaster   \n",
       "4  1fe17a1fd  04ddac2be  cb43ab175  b164b72a7      1  Grandmaster   \n",
       "\n",
       "         ord_2 ord_3 ord_4 ord_5  day  month  target  \n",
       "0         Cold     h     D    kr    2      2       0  \n",
       "1          Hot     a     A    bF    7      8       0  \n",
       "2     Lava Hot     h     R    Jc    7      2       0  \n",
       "3  Boiling Hot     i     D    kW    2      1       1  \n",
       "4     Freezing     a     R    qP    7      8       0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the remaining columns (position 12 onwards).\n",
    "train.iloc[:,12:].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0',\n",
       "       'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7',\n",
       "       'nom_8', 'nom_9', 'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4',\n",
       "       'ord_5', 'day', 'month', 'target'], dtype=object)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every column name of the training frame.\n",
    "train.columns.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "所有以 `bin_` 开头的属性是二值的。`nom_` 开头的属性是枚举值。`ord_` 开头的属性也是枚举值，但是存在顺序关系，比如 `ord_2` 这一列描述的是温度，温度是有高低关系的。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nom_0 has 3 unique values\n",
      "nom_1 has 6 unique values\n",
      "nom_2 has 6 unique values\n",
      "nom_3 has 6 unique values\n",
      "nom_4 has 4 unique values\n",
      "nom_5 has 222 unique values\n",
      "nom_6 has 522 unique values\n",
      "nom_7 has 1220 unique values\n",
      "nom_8 has 2215 unique values\n",
      "nom_9 has 11981 unique values\n"
     ]
    }
   ],
   "source": [
    "# Print how many distinct values each nominal column nom_0 .. nom_9 takes.\n",
    "for idx in range(10):\n",
    "    column = 'nom_{}'.format(idx)\n",
    "    n_distinct = len(train[column].unique())\n",
    "    print(\"nom_{} has {} unique values\".format(idx, n_distinct))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ord_0 has 3 unique values\n",
      "ord_1 has 5 unique values\n",
      "ord_2 has 6 unique values\n",
      "ord_3 has 15 unique values\n",
      "ord_4 has 26 unique values\n",
      "ord_5 has 192 unique values\n"
     ]
    }
   ],
   "source": [
    "# Print how many distinct values each ordinal column ord_0 .. ord_5 takes.\n",
    "for idx in range(6):\n",
    "    column = 'ord_{}'.format(idx)\n",
    "    n_distinct = len(train[column].unique())\n",
    "    print(\"ord_{} has {} unique values\".format(idx, n_distinct))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 300000 entries, 0 to 299999\n",
      "Data columns (total 25 columns):\n",
      "id        300000 non-null int64\n",
      "bin_0     300000 non-null int64\n",
      "bin_1     300000 non-null int64\n",
      "bin_2     300000 non-null int64\n",
      "bin_3     300000 non-null object\n",
      "bin_4     300000 non-null object\n",
      "nom_0     300000 non-null object\n",
      "nom_1     300000 non-null object\n",
      "nom_2     300000 non-null object\n",
      "nom_3     300000 non-null object\n",
      "nom_4     300000 non-null object\n",
      "nom_5     300000 non-null object\n",
      "nom_6     300000 non-null object\n",
      "nom_7     300000 non-null object\n",
      "nom_8     300000 non-null object\n",
      "nom_9     300000 non-null object\n",
      "ord_0     300000 non-null int64\n",
      "ord_1     300000 non-null object\n",
      "ord_2     300000 non-null object\n",
      "ord_3     300000 non-null object\n",
      "ord_4     300000 non-null object\n",
      "ord_5     300000 non-null object\n",
      "day       300000 non-null int64\n",
      "month     300000 non-null int64\n",
      "target    300000 non-null int64\n",
      "dtypes: int64(8), object(17)\n",
      "memory usage: 57.2+ MB\n",
      "None\n",
      "----------------------------------------\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200000 entries, 0 to 199999\n",
      "Data columns (total 24 columns):\n",
      "id       200000 non-null int64\n",
      "bin_0    200000 non-null int64\n",
      "bin_1    200000 non-null int64\n",
      "bin_2    200000 non-null int64\n",
      "bin_3    200000 non-null object\n",
      "bin_4    200000 non-null object\n",
      "nom_0    200000 non-null object\n",
      "nom_1    200000 non-null object\n",
      "nom_2    200000 non-null object\n",
      "nom_3    200000 non-null object\n",
      "nom_4    200000 non-null object\n",
      "nom_5    200000 non-null object\n",
      "nom_6    200000 non-null object\n",
      "nom_7    200000 non-null object\n",
      "nom_8    200000 non-null object\n",
      "nom_9    200000 non-null object\n",
      "ord_0    200000 non-null int64\n",
      "ord_1    200000 non-null object\n",
      "ord_2    200000 non-null object\n",
      "ord_3    200000 non-null object\n",
      "ord_4    200000 non-null object\n",
      "ord_5    200000 non-null object\n",
      "day      200000 non-null int64\n",
      "month    200000 non-null int64\n",
      "dtypes: int64(7), object(17)\n",
      "memory usage: 36.6+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# DataFrame.info() writes its report to stdout itself and returns None, so it is\n",
    "# called directly; wrapping it in print() only appended a stray \"None\" line.\n",
    "train.info()\n",
    "print(\"-\" * 40)\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "经过分析后，所有属性都是离散的，不存在缺失值，我觉得可以把所有属性都做 one-hot 编码即可。但是考虑到 `ord_` 属性是存在顺序关系的，可以尝试把 `ord_` 映射到 `0-1` 之间。后面分别尝试这两种方案。\n",
    "\n",
    "## 方案一\n",
    "\n",
    "所有属性都做 one-hot 编码。\n",
    "\n",
    "### 数据预处理\n",
    "\n",
    "将所有特征都做 one-hot 编码即可，每一个样本为一个 16552 维的稀疏向量。day 和 month 组合起来可以是一年中具体的某一天，可以构造出这样一个特征来（加入这个 date 特征后，下面代码实际得到的是 16636 维的稀疏向量）。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scheme 1 starts over from the untouched copies of the raw frames.\n",
    "train, test = train_copy.copy(), test_copy.copy()\n",
    "\n",
    "# Detach the label column: `target` holds it, `train` keeps the remaining columns.\n",
    "target = train.pop('target')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack train and test so that both are encoded with the same set of dummy columns.\n",
    "dataset = pd.concat([train, test], ignore_index=True)\n",
    "dataset = dataset.drop(['id'], axis=1)\n",
    "\n",
    "# Combine month and day into one categorical 'month-day' feature.\n",
    "# The builtin `str` replaces `np.str`, a deprecated alias that newer NumPy releases removed.\n",
    "dataset['date'] = dataset['month'].astype(str) + '-' + dataset['day'].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Takes roughly one minute to run.\n",
    "\"\"\"\n",
    "# One-hot encode every column of the combined frame, storing the result sparsely.\n",
    "X = pd.get_dummies(\n",
    "    dataset,\n",
    "    columns=dataset.columns,\n",
    "    sparse=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`pandas.get_dummies` 加 `sparse=True` 返回的是 DataFrame，可以使用 `to_coo` 方法得到稀疏矩阵（较新版本的 pandas 中需要通过 `DataFrame.sparse.to_coo()` 调用）。但是这里为了分离出训练集和测试集，需要再将 coo 矩阵转为 csr 矩阵，这样才能做行切片。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the one-hot frame to a SciPy matrix. Recent pandas only offers this through the\n",
    "# DataFrame `.sparse` accessor; the direct `to_coo()` call is kept as a fallback for the\n",
    "# older sparse frame objects that still provide it.\n",
    "X_coo = X.sparse.to_coo() if hasattr(X, 'sparse') else X.to_coo()\n",
    "\n",
    "# Row slicing requires CSR rather than COO, so convert before splitting.\n",
    "X = X_coo.tocsr()\n",
    "\n",
    "# Rows were stacked as train followed by test, so the first len(train) rows are train.\n",
    "n_train = train.shape[0]\n",
    "X_train, X_test = X[:n_train], X[n_train:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((300000, 16636), (200000, 16636))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Both splits should report the same number of one-hot columns.\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练模型\n",
    "\n",
    "这里数据量虽然很大，但是因为使用的是稀疏向量，因此训练 logistic regression 还是会很快。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.80145132, 0.80058547, 0.80766639, 0.80311128, 0.80372911])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "# `C` is scikit-learn's inverse regularisation strength for logistic regression.\n",
    "lr = LogisticRegression(C=0.1, solver='lbfgs')\n",
    "\n",
    "# 5-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.\n",
    "cross_val_score(\n",
    "    lr, X_train, target,\n",
    "    scoring='roc_auc', cv=5, n_jobs=-1,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Refit on the full training set before predicting the test set.\n",
    "lr.fit(X_train, target)\n",
    "\n",
    "# predict_proba orders its columns like lr.classes_, i.e. label 0 first, then label 1.\n",
    "y_proba = lr.predict_proba(X_test)\n",
    "\n",
    "# The submission must contain P(target = 1), which is column 1. The previous\n",
    "# `y_proba[:, 0]` wrote P(target = 0), i.e. inverted scores.\n",
    "submission = pd.DataFrame({\n",
    "    \"id\": test['id'],\n",
    "    \"target\": y_proba[:, 1]\n",
    "})\n",
    "\n",
    "submission.to_csv(\"../data/cat-in-the-dat/submission.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "提交到 kaggle 上之后得分 0.80780，排行榜上 56/347。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 方案二\n",
    "\n",
    "除了 `ord_` 属性外都做 one-hot 编码，`ord_` 属性转换为 0-1 之间的值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scheme 2 also restarts from the untouched raw copies.\n",
    "train, test = train_copy.copy(), test_copy.copy()\n",
    "\n",
    "# Move the label column out of `train` into `target`.\n",
    "target = train.pop('target')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack train and test into a single frame.\n",
    "dataset = pd.concat([train, test], ignore_index=True)\n",
    "\n",
    "# Combine month and day into one 'month-day' string feature. The builtin `str` replaces\n",
    "# `np.str`, a deprecated alias that newer NumPy releases removed.\n",
    "# NOTE(review): the preprocessing cell further down rebuilds `dataset` from train/test,\n",
    "# so this 'date' column does not reach the scheme 2 pipeline - confirm whether that is intended.\n",
    "dataset['date'] = dataset['month'].astype(str) + '-' + dataset['day'].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500000, 25)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: rows of train plus test, and the original columns plus 'date'.\n",
    "dataset.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, OrdinalEncoder, MinMaxScaler\n",
    "\n",
    "def make_pipeline(estimator_list):\n",
    "    \"\"\"Wrap the given estimators, in order, into a single Pipeline.\n",
    "\n",
    "    Every step is named after its estimator's class followed by its position\n",
    "    in the list (e.g. ``OrdinalEncoder0``), which keeps the step names unique.\n",
    "    \"\"\"\n",
    "    steps = []\n",
    "    for position, step in enumerate(estimator_list):\n",
    "        step_name = type(step).__name__ + str(position)\n",
    "        steps.append((step_name, step))\n",
    "    return Pipeline(steps)\n",
    "\n",
    "# Explicit orderings for the two ordinal features whose levels are words;\n",
    "# OrdinalEncoder numbers the levels by their position in these lists.\n",
    "ord_1_levels = ['Novice','Contributor','Expert','Master','Grandmaster']\n",
    "ord_2_levels = ['Freezing','Cold','Warm', 'Hot', 'Boiling Hot', 'Lava Hot']\n",
    "\n",
    "# Column groups handled by each transformer ('ord_6' is derived from 'ord_5' further down).\n",
    "nominal_columns = ['nom_'+str(i) for i in range(0, 10)]\n",
    "calendar_columns = ['day','month']\n",
    "inferred_order_columns = [\"ord_0\", \"ord_3\", \"ord_4\", \"ord_5\", \"ord_6\"]\n",
    "\n",
    "# Nominal and calendar columns are one-hot encoded, ordinal columns are encoded as\n",
    "# integers and rescaled by MinMaxScaler; all other columns pass through unchanged.\n",
    "full_pipeline = ColumnTransformer(\n",
    "    transformers=[\n",
    "        (\"nom_*\", OneHotEncoder(), nominal_columns),\n",
    "        (\"day/month\", OneHotEncoder(), calendar_columns),\n",
    "        (\"ord_0\", make_pipeline([OrdinalEncoder(), MinMaxScaler()]), inferred_order_columns),\n",
    "        (\"ord_1\", make_pipeline([OrdinalEncoder(categories=[ord_1_levels]), MinMaxScaler()]), [\"ord_1\"]),\n",
    "        (\"ord_2\", make_pipeline([OrdinalEncoder(categories=[ord_2_levels]), MinMaxScaler()]), [\"ord_2\"]),\n",
    "    ],\n",
    "    remainder='passthrough',\n",
    ")\n",
    "\n",
    "def prepare_frame(df):\n",
    "    \"\"\"Return a cleaned copy of `df` for the column transformer; `df` itself is not modified.\n",
    "\n",
    "    Drops 'id', maps the letter-coded binary columns 'bin_3' and 'bin_4' to 0/1, and splits\n",
    "    'ord_5' into its first character (kept as 'ord_5') and its second character (new\n",
    "    column 'ord_6').\n",
    "    \"\"\"\n",
    "    out = df.drop(['id'], axis=1)  # drop() returns a new frame, so `df` stays intact\n",
    "    out['bin_3'] = out['bin_3'].map({'F': 0, 'T': 1})\n",
    "    out['bin_4'] = out['bin_4'].map({'N': 0, 'Y': 1})\n",
    "    # 'ord_6' must be taken before 'ord_5' is overwritten with its first character.\n",
    "    out['ord_6'] = out['ord_5'].str[1]\n",
    "    out['ord_5'] = out['ord_5'].str[0]\n",
    "    return out\n",
    "\n",
    "# Build new frames instead of mutating train/test in place: the in-place version failed\n",
    "# when the cell was run a second time ('id' already dropped, 'ord_5' already split).\n",
    "train_prepared = prepare_frame(train)\n",
    "test_prepared = prepare_frame(test)\n",
    "\n",
    "# Fit the encoders on train and test together so both share the same categories.\n",
    "dataset = pd.concat([train_prepared, test_prepared], ignore_index=True)\n",
    "full_pipeline.fit(dataset)\n",
    "\n",
    "X_train = full_pipeline.transform(train_prepared)\n",
    "X_test = full_pipeline.transform(test_prepared)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.80179762, 0.80051735, 0.8070874 , 0.80310585, 0.80400994])"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "\n",
    "# `C` is scikit-learn's inverse regularisation strength for logistic regression.\n",
    "lr = LogisticRegression(C=0.2, solver='lbfgs')\n",
    "\n",
    "# 5-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.\n",
    "cross_val_score(\n",
    "    lr, X_train, target,\n",
    "    scoring='roc_auc', cv=5, n_jobs=-1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Prediction\n",
    "\"\"\"\n",
    "\n",
    "# Refit on the full training set before predicting the test set.\n",
    "lr.fit(X_train, target)\n",
    "\n",
    "# predict_proba orders its columns like lr.classes_, i.e. label 0 first, then label 1.\n",
    "y_proba = lr.predict_proba(X_test)\n",
    "\n",
    "# The submission must contain P(target = 1), which is column 1. The previous\n",
    "# `y_proba[:, 0]` wrote P(target = 0), i.e. inverted scores.\n",
    "submission = pd.DataFrame({\n",
    "    \"id\": test_copy['id'],\n",
    "    \"target\": y_proba[:, 1]\n",
    "})\n",
    "\n",
    "submission.to_csv(\"../data/cat-in-the-dat/submission_order.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
